This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

knitr::opts_chunk$set(cache=TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(faraway) #halfnorm
library(TSA) #eacf
## 
## Attaching package: 'TSA'
## The following object is masked from 'package:readr':
## 
##     spec
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(forecast) #autoarima
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method       from
##   fitted.Arima TSA 
##   plot.Arima   TSA
library(gridExtra) #grid arrange
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

setwd("/Users/Large_Data/Final_Data")
test <- read.csv(file = '202010.csv')
head(test)
glimpse(test)
## Rows: 352,106
## Columns: 110
## $ Year                            <int> 2020, 2020, 2020, 2020, 2020, 2020, 2…
## $ Quarter                         <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ Month                           <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 1…
## $ DayofMonth                      <int> 8, 9, 11, 12, 13, 14, 15, 16, 18, 19,…
## $ DayOfWeek                       <int> 4, 5, 7, 1, 2, 3, 4, 5, 7, 1, 2, 3, 4…
## $ FlightDate                      <chr> "2020-10-08", "2020-10-09", "2020-10-…
## $ Reporting_Airline               <chr> "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ DOT_ID_Reporting_Airline        <int> 19805, 19805, 19805, 19805, 19805, 19…
## $ IATA_CODE_Reporting_Airline     <chr> "AA", "AA", "AA", "AA", "AA", "AA", "…
## $ Tail_Number                     <chr> "N932AM", "N934AA", "N992AU", "N132AN…
## $ Flight_Number_Reporting_Airline <int> 2259, 2259, 2259, 2259, 2259, 2259, 2…
## $ OriginAirportID                 <int> 11298, 11298, 11298, 11298, 11298, 11…
## $ OriginAirportSeqID              <int> 1129806, 1129806, 1129806, 1129806, 1…
## $ OriginCityMarketID              <int> 30194, 30194, 30194, 30194, 30194, 30…
## $ Origin                          <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "D…
## $ OriginCityName                  <chr> "Dallas/Fort Worth, TX", "Dallas/Fort…
## $ OriginState                     <chr> "TX", "TX", "TX", "TX", "TX", "TX", "…
## $ OriginStateFips                 <int> 48, 48, 48, 48, 48, 48, 48, 48, 48, 4…
## $ OriginStateName                 <chr> "Texas", "Texas", "Texas", "Texas", "…
## $ OriginWac                       <int> 74, 74, 74, 74, 74, 74, 74, 74, 74, 7…
## $ DestAirportID                   <int> 14107, 14107, 14107, 14107, 14107, 14…
## $ DestAirportSeqID                <int> 1410702, 1410702, 1410702, 1410702, 1…
## $ DestCityMarketID                <int> 30466, 30466, 30466, 30466, 30466, 30…
## $ Dest                            <chr> "PHX", "PHX", "PHX", "PHX", "PHX", "P…
## $ DestCityName                    <chr> "Phoenix, AZ", "Phoenix, AZ", "Phoeni…
## $ DestState                       <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "…
## $ DestStateFips                   <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ DestStateName                   <chr> "Arizona", "Arizona", "Arizona", "Ari…
## $ DestWac                         <int> 81, 81, 81, 81, 81, 81, 81, 81, 81, 8…
## $ CRSDepTime                      <int> 1430, 1430, 1430, 1430, 1430, 1430, 1…
## $ DepTime                         <int> 1432, 1449, 1428, 1432, 1432, 1429, 1…
## $ DepDelay                        <dbl> 2, 19, -2, 2, 2, -1, -2, 110, 0, 33, …
## $ DepDelayMinutes                 <dbl> 2, 19, 0, 2, 2, 0, 0, 110, 0, 33, 3, …
## $ DepDel15                        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0…
## $ DepartureDelayGroups            <int> 0, 1, -1, 0, 0, -1, -1, 7, 0, 2, 0, 1…
## $ DepTimeBlk                      <chr> "1400-1459", "1400-1459", "1400-1459"…
## $ TaxiOut                         <dbl> 15, 20, 13, 20, 18, 16, 20, 17, 12, 1…
## $ WheelsOff                       <int> 1447, 1509, 1441, 1452, 1450, 1445, 1…
## $ WheelsOn                        <int> 1447, 1508, 1449, 1452, 1458, 1447, 1…
## $ TaxiIn                          <dbl> 5, 6, 9, 7, 10, 7, 9, 9, 6, 9, 6, 6, …
## $ CRSArrTime                      <int> 1505, 1505, 1505, 1505, 1505, 1505, 1…
## $ ArrTime                         <int> 1452, 1514, 1458, 1459, 1508, 1454, 1…
## $ ArrDelay                        <dbl> -13, 9, -7, -6, 3, -11, -1, 101, -15,…
## $ ArrDelayMinutes                 <dbl> 0, 9, 0, 0, 3, 0, 0, 101, 0, 24, 0, 1…
## $ ArrDel15                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0…
## $ ArrivalDelayGroups              <int> -1, 0, -1, -1, 0, -1, -1, 6, -1, 1, -…
## $ ArrTimeBlk                      <chr> "1500-1559", "1500-1559", "1500-1559"…
## $ Cancelled                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CancellationCode                <chr> "", "", "", "", "", "", "", "", "", "…
## $ Diverted                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CRSElapsedTime                  <dbl> 155, 155, 155, 155, 155, 155, 155, 15…
## $ ActualElapsedTime               <dbl> 140, 145, 150, 147, 156, 145, 156, 14…
## $ AirTime                         <dbl> 120, 119, 128, 120, 128, 122, 127, 12…
## $ Flights                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Distance                        <dbl> 868, 868, 868, 868, 868, 868, 868, 86…
## $ DistanceGroup                   <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
## $ CarrierDelay                    <dbl> NA, NA, NA, NA, NA, NA, NA, 101, NA, …
## $ WeatherDelay                    <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ NASDelay                        <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ SecurityDelay                   <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 0,…
## $ LateAircraftDelay               <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, 7,…
## $ FirstDepTime                    <int> NA, NA, NA, NA, NA, NA, NA, 1427, NA,…
## $ TotalAddGTime                   <dbl> NA, NA, NA, NA, NA, NA, NA, 35, NA, N…
## $ LongestAddGTime                 <dbl> NA, NA, NA, NA, NA, NA, NA, 35, NA, N…
## $ DivAirportLandings              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ DivReachedDest                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivActualElapsedTime            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivArrDelay                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DivDistance                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1Airport                     <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div1AirportID                   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1AirportSeqID                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1WheelsOn                    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1TotalGTime                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1LongestGTime                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1WheelsOff                   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div1TailNum                     <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div2Airport                     <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div2AirportID                   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2AirportSeqID                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2WheelsOn                    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2TotalGTime                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2LongestGTime                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2WheelsOff                   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div2TailNum                     <chr> "", "", "", "", "", "", "", "", "", "…
## $ Div3Airport                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3AirportID                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3AirportSeqID                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3WheelsOn                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3TotalGTime                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3LongestGTime                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3WheelsOff                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div3TailNum                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4Airport                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4AirportID                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4AirportSeqID                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4WheelsOn                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4TotalGTime                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4LongestGTime                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4WheelsOff                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div4TailNum                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5Airport                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5AirportID                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5AirportSeqID                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5WheelsOn                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5TotalGTime                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5LongestGTime                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5WheelsOff                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Div5TailNum                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ X                               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

Variables of our interest: FlightDate 2018-01-02 IATA_CODE_Reporting_Airline e.g. AA UA Flight_Number_Reporting_Airline 588 Origin JFK Dest SFO DepDelayMinutes <15 -> =0 ArrDelayMinutes Cancelled 0/1 Diverted 0/1

Airports: EWR Airplines: AA Overall Cancellation Delay Total 120 months

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
path = "/Users/Large_Data/Final_Data"
setwd(path)
pattern = '*.csv'
files = list.files(path, pattern, full.names = TRUE)
ap.numflights.df <- data.frame(ORIGIN=c('EWR','JFK'), n=c(0,0))
al.numflights.df <- data.frame(IATA_CODE_REPORTING_AIRLINE=c('AA','UA'), n=c(0,0))
dep.timeblk.numflights.df <- data.frame(DEPTIMEBLK=character(),n=integer())
arr.timeblk.numflights.df <- data.frame(ARRTIMEBLK=character(),n=integer())

num.total <- NULL
num.cancel <- NULL
num.ontime <- NULL #dep delay<=15min
avg.delay <- NULL
iter<-1

for (file in files){
  # read one file at a time from files as df
  df <- fread(file=file,sep = ",", stringsAsFactors = FALSE, header = TRUE)
  names(df) <- toupper(names(df))
  if ('FL_DATE' %in% names(df)){
    df <- df[,c('OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 
                'DEP_DELAY_NEW', 'DEP_TIME_BLK', 'ARR_DELAY_NEW',
                'ARR_TIME_BLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')]
    names(df) <- c('IATA_CODE_REPORTING_AIRLINE', 'ORIGIN', 'DEST',
                   'DEPDELAYMINUTES', 'DEPTIMEBLK', 'ARRDELAYMINUTES',
                   'ARRTIMEBLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')
  }
  else
    df <- df[,c('IATA_CODE_REPORTING_AIRLINE', 'ORIGIN', 'DEST', 'DEPDELAYMINUTES', 'DEPTIMEBLK', 'ARRDELAYMINUTES', 'ARRTIMEBLK', 'CANCELLED', 'DIVERTED', 'DISTANCE')]
  
  # Overall
  num.total<-c(num.total, nrow(df))
  num.cancel<-c(num.cancel, nrow(df[df$CANCELLED==1,]))
  num.ontime<-c(num.ontime, nrow(df[(df$CANCELLED==0) & 
              (df$DIVERTED==0) &(df$DEPDELAYMINUTES<=15),]))
  avg.delay <-c(avg.delay,mean(df[(df$CANCELLED==0) & 
              (df$DIVERTED==0) & 
                (df$DEPDELAYMINUTES>15),]$DEPDELAYMINUTES))
  
  # find major airports and airlines frequence of time
  ap.numflights.df<- bind_rows(ap.numflights.df, 
    count(df,ORIGIN)) %>% 
    group_by(ORIGIN) %>% 
    summarise_all(sum)
  al.numflights.df<- bind_rows(al.numflights.df, 
    count(df,IATA_CODE_REPORTING_AIRLINE)) %>%
    group_by(IATA_CODE_REPORTING_AIRLINE) %>% 
    summarise_all(sum)
  dep.timeblk.numflights.df<- bind_rows(
    dep.timeblk.numflights.df, 
    count(df,DEPTIMEBLK)) %>% 
    group_by(DEPTIMEBLK) %>% 
    summarise_all(sum)
  arr.timeblk.numflights.df<- bind_rows(
    arr.timeblk.numflights.df, 
    count(df,ARRTIMEBLK)) %>% 
    group_by(ARRTIMEBLK) %>% 
    summarise_all(sum)
  if (iter%%10==0)
    print(paste(iter,'/',length(files)))
  iter<-iter+1
}
## [1] "10 / 120"
## [1] "20 / 120"
## [1] "30 / 120"
## [1] "40 / 120"
## [1] "50 / 120"
## [1] "60 / 120"
## [1] "70 / 120"
## [1] "80 / 120"
## [1] "90 / 120"
## [1] "100 / 120"
## [1] "110 / 120"
## [1] "120 / 120"

Airport departure delay airport arrival delay Identify number of scheduled flights departured in an airport monthly 9.75 yrs 3561 days

ap.numflights<-ap.numflights.df$n/9.75
al.numflights<-al.numflights.df$n/9.75
halfnorm(ap.numflights,labs = ap.numflights.df$ORIGIN,ylab='Num of Flights Annually',cex=0.2, nlab = 5,main='Finding Large Airports')

halfnorm(al.numflights,labs = al.numflights.df$IATA_CODE_REPORTING_AIRLINE,ylab='Num of Flights Annually',cex= 0.2,nlab = 5,main='Finding Large Airlines')

dep.timeblk.numflights.df<-dep.timeblk.numflights.df %>% 
  rename(TIMEBLK=DEPTIMEBLK) %>%
  as.data.frame()

arr.timeblk.numflights.df<-arr.timeblk.numflights.df %>% 
  rename(TIMEBLK=ARRTIMEBLK) %>%
  as.data.frame()
ggplot()+
  geom_line(data=dep.timeblk.numflights.df[-1,],aes(x=as.factor(TIMEBLK),y=log10(n),col='DEP',group=1))+
  geom_line(data=arr.timeblk.numflights.df[-1,],aes(x=as.factor(TIMEBLK),y=log10(n),col='ARR',group=1))+
  theme(axis.text.x = element_text(angle = 45), plot.title = element_text(hjust = 0.5))+
  ylab('log10(Num of Flights)')+
  xlab('Time Blocks')+
  ggtitle('Daily Variation of Num of Flights')

LAX DEN DFW ORD ATL Top 20%(76) Airports takes up more than 87% number of flights top 4.7%(18) takes up 50% of scheduled flights.

low.ap.numflights<-quantile(ap.numflights,0.9)
top_df<-ap.numflights.df[ap.numflights.df$n>=low.ap.numflights,]
sum(top_df$n/9.75)/sum(ap.numflights.df$n/9.75)
## [1] 0.948244
low.al.numflights<-quantile(al.numflights,0.9)
al.top_df<-al.numflights.df[al.numflights.df$n>=low.al.numflights,]
sum(al.top_df$n/9.75)/sum(al.numflights.df$n/9.75)
## [1] 0.9659906

Now let’s focus on these 20% airports. And calculate monthly cancellation rate, ontime rate, average delay min, average taxi out.

Overall cancel rate

cancel.list<-num.cancel/num.total
cancel.series<-ts(cancel.list,start=c(2011,1),frequency=12)
plot(cancel.series,type='o',main='Cancellation Variation',ylab='Cancellation Rate')

ontime.list<-num.ontime/num.total
ontime.series<-ts(ontime.list,start=c(2011,1),frequency=12)
plot(ontime.series,type='o',main='On-Time Rate Variation',ylab='On-Time Rate')

avgdelay.list<-avg.delay
avgdelay.series<-ts(avgdelay.list,start=c(2011,1),frequency=12)
plot(avgdelay.series,type='o',main='Average Delay Variation',ylab='Average Delay Minutes')

What happen in April 2020?? ontime and cancellation is negatively correlated.

before.pandemic.df<-data.frame(time=seq.Date(from = as.Date("2011-01-01"), 
                                to=as.Date("2020-02-01"), by = "month"),
           cancel=cancel.list[0:110],
           ontime=ontime.list[0:110],
           avgdelay=avgdelay.list[0:110])

cancel.plot<-ggplot(before.pandemic.df)+
  geom_line(aes(x=time,y=cancel,group=1))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        plot.title = element_text(hjust = 0.5))+
  scale_x_date(date_breaks = "3 months",
               date_labels = "%Y %m") +
  xlab('')+
  ggtitle('Cancellation Rate Jan 2011 to Mar 2020')+
  ylab('Cancellation Rate')

ontime.plot<-ggplot(before.pandemic.df)+
  geom_line(aes(x=time,y=ontime,group=2))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        plot.title = element_text(hjust = 0.5))+
  scale_x_date(date_breaks = "3 months",
               date_labels = "%Y %m") +
  ggtitle('On-Time Rate Jan 2011 to Mar 2020')+
  xlab('Time')+
  ylab('On-Time Rate')

grid.arrange(cancel.plot, ontime.plot, nrow=2)

ggplot(before.pandemic.df)+
  geom_line(aes(x=time,y=avgdelay,group=1))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        plot.title = element_text(hjust = 0.5))+
  scale_x_date(date_breaks = "3 months",
               date_labels = "%Y %m") +
  xlab('Time')+
  ggtitle('Average Delay Mins Jan 2011 to Mar 2020')+
  ylab('Average Delay Mins')

cancel.series<-ts(cancel.list[0:110],start=c(2011,1),frequency=12)
#plot(part.cancel.series,type='o',main='Cancellation Variation Until Dec 2019',ylab='Cancellation Rate')
ontime.series<-ts(ontime.list[0:110],start=c(2011,1),frequency=12)
#plot(part.ontime.series,type='o',main='On-Time Variation Until Dec 2019',ylab='On-Time Rate')
avgdelay.series<-ts(avgdelay.list[0:110],start=c(2011,1),frequency=12)
#plot(avgdelay.series,type='o',ylab='Average Delay Minutes')
plot(decompose(cancel.series))

plot(decompose(ontime.series))

plot(decompose(avgdelay.series))

Is it stationary? No! seasonal pattern. kind of Invertible?

par(mfrow=c(2,1))
unfreq.log.cancel.series<-log(ts(cancel.series))
plot(unfreq.log.cancel.series)
acf(unfreq.log.cancel.series,lag.max = 50,main='Sample ACF of log Cancellation Rate Series')

pacf(unfreq.log.cancel.series,lag.max = 50,main='Sample PACF of log Cancellation Rate Series')

plot(diff(unfreq.log.cancel.series,12))

acf(diff(unfreq.log.cancel.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log Cancellation Rate Series')
pacf(diff(unfreq.log.cancel.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log Cancellation Rate Series')

auto.arima(cancel.series)
## Series: cancel.series 
## ARIMA(2,0,0)(1,0,0)[12] with non-zero mean 
## 
## Coefficients:
##          ar1      ar2    sar1    mean
##       0.5550  -0.1938  0.1681  0.0162
## s.e.  0.0971   0.1033  0.1050  0.0015
## 
## sigma^2 estimated as 7.554e-05:  log likelihood=367.62
## AIC=-725.24   AICc=-724.67   BIC=-711.74

CR: ARIMA(2,0,0)(1,0,0)[12], From first differencing plots, we have

par(mfrow=c(2,1))
unfreq.log.ontime.series<-log(ts(ontime.series))
plot(unfreq.log.ontime.series)
acf(unfreq.log.ontime.series,lag.max = 50,main='Sample ACF of log On-Time Rate Series')

pacf(unfreq.log.ontime.series,lag.max = 50,main='Sample PACF of log On-Time Rate Series')
#Seasonal differencing 
plot(diff(unfreq.log.ontime.series,12))

acf(diff(unfreq.log.ontime.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log On-Time Rate Series')
pacf(diff(unfreq.log.ontime.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log On-Time Rate Series')

plot(diff(diff(unfreq.log.ontime.series,12),12))
acf(diff(diff(unfreq.log.ontime.series,12),12),lag.max = 50,main='Sample ACF of 2nd Differencing log On-Time Rate Series')

pacf(diff(diff(unfreq.log.ontime.series,12),12),lag.max = 50, main='Sample PACF of 2nd Differencing log On-Time Rate Series')
auto.arima(ontime.series)
## Series: ontime.series 
## ARIMA(1,0,0)(2,1,0)[12] 
## 
## Coefficients:
##          ar1     sar1     sar2
##       0.5719  -0.5138  -0.5006
## s.e.  0.0851   0.1009   0.0938
## 
## sigma^2 estimated as 0.0007657:  log likelihood=209.63
## AIC=-411.26   AICc=-410.83   BIC=-400.92

par(mfrow=c(2,1))
unfreq.log.avgdelay.series<-log(ts(avgdelay.series))
plot(unfreq.log.avgdelay.series)
acf(unfreq.log.avgdelay.series,lag.max = 50,main='Sample ACF of log Average Delay Minutes Series')

pacf(unfreq.log.avgdelay.series,lag.max = 50,main='Sample PACF of log Average Delay Minutes Series')

plot(diff(unfreq.log.avgdelay.series,12))

acf(diff(unfreq.log.avgdelay.series,12),lag.max = 50,main='Sample ACF of 1st Differencing log Average Delay Minutes Series')
pacf(diff(unfreq.log.avgdelay.series,12),lag.max = 50,main='Sample PACF of 1st Differencing log Average Delay Minutes Series')

plot(diff(diff(unfreq.log.avgdelay.series),12))
acf(diff(diff(unfreq.log.avgdelay.series),12),lag.max = 50,main='Sample ACF of 2nd Differencing log Average Delay Minutes Series')

pacf(diff(diff(unfreq.log.avgdelay.series),12),lag.max = 50, main='Sample PACF of 2nd Differencing log Average Delay Minutes Series')
auto.arima(avgdelay.series)
## Series: avgdelay.series 
## ARIMA(0,1,2)(0,0,2)[12] with drift 
## 
## Coefficients:
##           ma1      ma2    sma1    sma2   drift
##       -0.5064  -0.4097  0.2408  0.2088  0.1353
## s.e.   0.0869   0.0870  0.0994  0.0885  0.0409
## 
## sigma^2 estimated as 10.36:  log likelihood=-281
## AIC=574   AICc=574.82   BIC=590.15

library(tseries)
adf.test(unfreq.log.cancel.series,k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  unfreq.log.cancel.series
## Dickey-Fuller = -1.847, Lag order = 12, p-value = 0.6401
## alternative hypothesis: stationary
adf.test(unfreq.log.ontime.series,k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  unfreq.log.ontime.series
## Dickey-Fuller = -1.7595, Lag order = 12, p-value = 0.6764
## alternative hypothesis: stationary
adf.test(unfreq.log.avgdelay.series,k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  unfreq.log.avgdelay.series
## Dickey-Fuller = -1.8856, Lag order = 12, p-value = 0.6241
## alternative hypothesis: stationary
# aic bic
d<-0
for (p in 0:3)
  for (q in 0:4)
    if(p+q<6){
      cr.model<-arima(cancel.series, order=c(p,d,q), seasonal = list(order = c(1,0,0), period = 12),method = "ML")
      print(paste('p=',p,', q=',q,', AIC=',AIC(cr.model),', BIC=',BIC(cr.model)))
    }
## [1] "p= 0 , q= 0 , AIC= -700.136451777541 , BIC= -692.035010680164"
## [1] "p= 0 , q= 1 , AIC= -723.825497427258 , BIC= -713.023575964088"
## [1] "p= 0 , q= 2 , AIC= -725.543862439714 , BIC= -712.041460610752"
## [1] "p= 0 , q= 3 , AIC= -724.483953426036 , BIC= -708.281071231281"
## [1] "p= 0 , q= 4 , AIC= -722.739811987439 , BIC= -703.836449426892"
## [1] "p= 1 , q= 0 , AIC= -723.803324817326 , BIC= -713.001403354157"
## [1] "p= 1 , q= 1 , AIC= -724.207330744429 , BIC= -710.704928915466"
## [1] "p= 1 , q= 2 , AIC= -725.912955185278 , BIC= -709.710072990524"
## [1] "p= 1 , q= 3 , AIC= -723.025774095118 , BIC= -704.122411534571"
## [1] "p= 1 , q= 4 , AIC= -722.199986853368 , BIC= -700.596143927028"
## [1] "p= 2 , q= 0 , AIC= -725.243010829426 , BIC= -711.740609000464"
## [1] "p= 2 , q= 1 , AIC= -723.347142494487 , BIC= -707.144260299733"
## [1] "p= 2 , q= 2 , AIC= -724.025030640364 , BIC= -705.121668079817"
## [1] "p= 2 , q= 3 , AIC= -724.947207421689 , BIC= -703.343364495349"
## [1] "p= 3 , q= 0 , AIC= -723.445538144685 , BIC= -707.242655949931"
## [1] "p= 3 , q= 1 , AIC= -722.431283235484 , BIC= -703.527920674937"
## [1] "p= 3 , q= 2 , AIC= -723.887611909099 , BIC= -702.28376898276"

Residual analysis

cancel.model <- arima(cancel.series, order=c(2,0,0),seasonal = list(order = c(1,0,0), period = 12), method="ML")
cancel.residual <- residuals(cancel.model)
plot(cancel.residual, main=expression("Residuals of CR model ARIMA(2,0,0)(1,0,0)[12]"), ylab='Residuals')
abline(h=0)

ontime.model <- arima(ontime.series, order=c(1,0,0),seasonal = list(order = c(2,1,0), period = 12), method="ML")
ontime.residual <- residuals(ontime.model)
plot(ontime.residual, main=expression("Residuals of OTR model ARIMA(1,0,0)(2,1,0)[12]"), ylab='Residuals')
abline(h=0)

avgdelay.model <- arima(avgdelay.series, order=c(0,1,2), seasonal = list(order = c(0,0,2), period = 12), method="ML")
avgdelay.residual <- residuals(avgdelay.model)
plot(avgdelay.residual, main=expression("Residuals of ADM model ARIMA(0,1,2)(0,0,2)[12]"), ylab='Residuals')
abline(h=0)

LB.test(cancel.model)
## 
##  Box-Ljung test
## 
## data:  residuals from  cancel.model
## X-squared = 7.5826, df = 9, p-value = 0.5767
LB.test(ontime.model)
## 
##  Box-Ljung test
## 
## data:  residuals from  ontime.model
## X-squared = 12.835, df = 9, p-value = 0.1702
LB.test(avgdelay.model)
## 
##  Box-Ljung test
## 
## data:  residuals from  avgdelay.model
## X-squared = 3.9267, df = 8, p-value = 0.8637
qqnorm(cancel.residual)
qqline(cancel.residual)

shapiro.test(cancel.residual)
## 
##  Shapiro-Wilk normality test
## 
## data:  cancel.residual
## W = 0.90547, p-value = 9.682e-07
shapiro.test(ontime.residual)
## 
##  Shapiro-Wilk normality test
## 
## data:  ontime.residual
## W = 0.98201, p-value = 0.1446
shapiro.test(avgdelay.residual)
## 
##  Shapiro-Wilk normality test
## 
## data:  avgdelay.residual
## W = 0.98142, p-value = 0.1287

Predict

predict(cancel.model, n.ahead=10)
## $pred
##             Mar        Apr        May        Jun        Jul        Aug
## 2020 0.01295315 0.01710937 0.01745776 0.01732542 0.01685408 0.01632351
##             Sep        Oct        Nov        Dec
## 2020 0.01620622 0.01481093 0.01469579 0.01501457
## 
## $se
##              Mar         Apr         May         Jun         Jul         Aug
## 2020 0.008531662 0.009757690 0.009806255 0.009813489 0.009821557 0.009822670
##              Sep         Oct         Nov         Dec
## 2020 0.009822671 0.009822706 0.009822718 0.009822718
plot(forecast(auto.arima(cancel.series)), main = expression("Forecast ARIMA(2,0,0)(1,0,0)[12]"))
lines(ts(cancel.list[110:120],start=c(2020,2),frequency=12),col='red')

plot(forecast(auto.arima(ontime.series)), main = expression("Forecast ARIMA(1,0,0)(2,1,0)[12]"))
lines(ts(ontime.list[110:120],start=c(2020,2),frequency=12),col='red')

plot(forecast(auto.arima(avgdelay.series)), main = expression("Forecast ARIMA(0,1,2)(0,0,2)[12]"))
lines(ts(avgdelay.list[110:120],start=c(2020,2),frequency=12),col='red')

pandemic series

pand.cancel.series<-ts(cancel.list[110:120],start=c(2020,03),frequency=12)
plot(pand.cancel.series)

pand.ontime.series<-ts(ontime.list[110:120],start=c(2020,03),frequency=12)
plot(pand.ontime.series)

pand.avgdelay.series<-ts(avgdelay.list[91:115],start=c(2018,07),frequency=12)
plot(pand.avgdelay.series)

auto.arima(pand.avgdelay.series)
## Series: pand.avgdelay.series 
## ARIMA(1,1,0)(0,1,0)[12] 
## 
## Coefficients:
##           ar1
##       -0.5998
## s.e.   0.2498
## 
## sigma^2 estimated as 18.56:  log likelihood=-34.25
## AIC=72.5   AICc=73.84   BIC=73.47
new.cancel.list<-cancel.list
new.cancel.list[111]<-mean(new.cancel.list[0:110])
new.cancel.list[112]<-mean(new.cancel.list[0:110])
new.cancel.series<-ts(new.cancel.list[0:115],start=c(2011,01),frequency=12)

new.ontime.list<-ontime.list
new.ontime.list[111]<-mean(new.ontime.list[0:110])
new.ontime.list[112]<-mean(new.ontime.list[0:110])
new.ontime.series<-ts(new.ontime.list[0:115],start=c(2011,01),frequency=12)
plot(forecast(auto.arima(new.cancel.series),h=5), main = expression("Forecast ARIMA(2,0,0)(1,0,0)[12]"))
lines(ts(new.cancel.list[110:120],start=c(2020,2),frequency=12),col='red')

plot(forecast(auto.arima(new.ontime.series),h=5), main = expression("Forecast ARIMA(1,0,0)(2,1,0)[12]"))
lines(ts(new.ontime.list[110:120],start=c(2020,2),frequency=12),col='red')

avgdelay.series.extended<-ts(avgdelay.list[0:115],start=c(2011,1),frequency=12)
plot(forecast(auto.arima(pand.avgdelay.series),h=5), main = expression("Forecast ARIMA(1,1,0)(0,1,0)[12]"))
lines(ts(avgdelay.list[110:120],start=c(2020,2),frequency=12),col='red')

Nonstationary Cancellation rate: We can find p is between 0-3 and q is between 0-4. Decide by AIC and BIC find both minimal

Average delay minutes has an upwarding trend. non stationary residual flights are independent ADF unit root of AIC BIC